Zillow:
Zillow is an online real estate database company founded in 2006 - Wikipedia
Zestimate:
“Zestimates” are estimated home values based on 7.5 million statistical and machine learning models that analyze hundreds of data points on each property. Zillow has continually improved the median margin of error, from 14% at the onset to 5% today.
Objective:
Building a model to improve the Zestimate residual error.
The competition has two stages. This public competition runs until January 2018 and offers $50,000 in prize money. Please make sure to read the Prize details and the Competition overview, since they are quite different for this competition.
Let us first import the necessary modules.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import folium
import missingno as msno
%matplotlib inline
# Load the training data into `org`, the master frame that the later steps
# read from. The CSV is looked up relative to the working directory.
org = pd.read_csv('zillow_train_data.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
org.info()
# Count the missing values per column, keep only the columns that have any,
# and order them from fewest to most missing.
null_counts = org.isnull().sum(axis=0)
missing_df = null_counts.reset_index()
missing_df.columns = ['column_name', 'missing_count']
has_gaps = missing_df['missing_count'] > 0
missing_df = missing_df[has_gaps].sort_values(by='missing_count')

# Horizontal bar chart: one bar per incomplete column.
ind = np.arange(len(missing_df))
width = 0.9  # defined but not passed to barh below
fig, ax = plt.subplots(figsize=(12, 18))
rects = ax.barh(ind, missing_df['missing_count'].values, color='blue')
ax.set_yticks(ind)
ax.set_yticklabels(missing_df['column_name'].values, rotation='horizontal')
ax.set_title("Number of missing values in each column")
ax.set_xlabel("Count of missing values")
plt.show()
# Restrict the completeness plots to columns with at least one missing value.
columns_with_gaps = org.isnull().any()
missingValueColumns = org.columns[columns_with_gaps].tolist()

# Per-column completeness as a bar chart.
msno.bar(
    org[missingValueColumns],
    figsize=(20, 8),
    color="#34495e",
    fontsize=12,
    labels=True,
)

# Row-by-row nullity matrix with a sparkline on the side.
msno.matrix(
    org[missingValueColumns],
    width_ratios=(10, 1),
    figsize=(20, 8),
    color=(0.25, 0.1, 0),
    fontsize=12,
    sparkline=True,
    labels=True,
)

# savefig writes whichever figure is current at this point
# (presumably the matrix plot drawn last - confirm).
plt.savefig('Datacompleteness.png')
plt.show()
# Bind df_train before first use; the original referenced it here although it
# is only assigned further down. It is an alias of `org` (not a copy), so the
# clipping below also changes `org`.
df_train = org

# Cap logerror at its 1st and 99th percentiles so the extreme tails do not
# dominate the histogram.
ulimit = np.percentile(df_train.logerror.values, 99)
llimit = np.percentile(df_train.logerror.values, 1)
# One clip on the column replaces the chained
# `df_train['logerror'].loc[mask] = value` assignments, which write through an
# intermediate Series and are not guaranteed to update the frame.
df_train['logerror'] = df_train['logerror'].clip(lower=llimit, upper=ulimit)

plt.figure(figsize=(12, 8))
sns.distplot(df_train.logerror.values, bins=50, kde=False);
plt.title('Distribution of logerror')
plt.xlabel('logerror', fontsize=12)
# NOTE(review): the four reference-line positions are hard-coded; they look
# like boundaries between error-range buckets - confirm where they come from.
plt.axvline(x=0.05, color='k', linestyle='--')
plt.axvline(x=-0.036, color='k', linestyle='--')
plt.axvline(x=0.094, color='g', linestyle='-.')
plt.axvline(x=-0.074, color='g', linestyle='-.')
plt.savefig('Logerrordistribution.png')
plt.show()
# Plot logerror in ascending order, coloured by the `errrange` column.
# The rows are sorted as a whole so that each point's colour comes from the
# same row as its logerror value; previously only the y-values were sorted
# while the colours stayed in original row order.
ordered = org.sort_values(by='logerror')
plt.figure(figsize=(8, 6))
plt.scatter(
    range(ordered.shape[0]),
    ordered['logerror'].values,
    c=ordered['errrange'].values,
    cmap='jet',
    alpha=0.2,
)
plt.xlabel('index', fontsize=12)
plt.ylabel('logerror', fontsize=12)
plt.show()
# Column names plotted as continuous features (histogram + regression plot).
continuous = [
    # areas
    'basementsqft',
    'finishedfloor1squarefeet',
    'calculatedfinishedsquarefeet',
    'finishedsquarefeet12',
    'finishedsquarefeet13',
    'finishedsquarefeet15',
    'finishedsquarefeet50',
    'finishedsquarefeet6',
    'garagetotalsqft',
    # location and lot
    'latitude',
    'longitude',
    'lotsizesquarefeet',
    'poolsizesum',
    'yardbuildingsqft17',
    'yardbuildingsqft26',
    'yearbuilt',
    # tax values
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'landtaxvaluedollarcnt',
    'taxamount',
]

# Column names plotted as discrete features (count plot + box plot).
discrete = [
    'bathroomcnt',
    'bedroomcnt',
    'calculatedbathnbr',
    'fireplacecnt',
    'fullbathcnt',
    'garagecarcnt',
    'poolcnt',
    'roomcnt',
    'threequarterbathnbr',
    'unitcnt',
    'numberofstories',
    'assessmentyear',
    'taxdelinquencyyear',
]
# df_train is an alias of org, not a copy.
df_train = org

### Continuous variable plots
# One figure per continuous feature: its distribution on the left and a
# regression of logerror on the feature on the right. Rows where the feature
# is missing are left out.
for col in continuous:
    present = df_train[col].notnull()
    xx = df_train.loc[present, ['logerror', col]]
    fig = plt.figure(figsize=(12, 6))
    left_ax = plt.subplot(121)
    sns.distplot(xx[col], color='Sienna', ax=left_ax)
    right_ax = plt.subplot(122)
    sns.regplot(xx[col], y=xx['logerror'], color='blue', ax=right_ax)
    plt.suptitle(col, fontsize=16)
    del xx
### Discrete variable plots
# One figure per discrete feature: value counts on the left and the logerror
# distribution per value on the right. Missing values are shown as their own
# -1 category.
for col in discrete:
    # Assign the filled column back to the frame. The original called
    # fillna(..., inplace=True) on the selected column, a chained operation
    # that is not guaranteed to update `org`.
    # NOTE: this permanently replaces NaN with -1 in `org`, so later steps
    # that read `org` see -1 instead of a missing value for these columns.
    org[col] = org[col].fillna(-1)
    # No NaN remains after the fill, so the original dropna() was a no-op.
    values = org[col]
    fig = plt.figure(figsize=(9, 6))
    sns.countplot(x=values, color='Sienna', ax=plt.subplot(121))
    sns.boxplot(x=org[col], y=org['logerror'], color='blue', ax=plt.subplot(122))
    plt.suptitle(col, fontsize=16)
# Scatter plots of parcel coordinates coloured by logerror, one plot per
# `errrange` value (1, 2, 3). Each plot layers geom_point and geom_jitter and
# uses a white-to-red colour gradient.
# NOTE(review): latitude is mapped to x and longitude to y, which is the
# transpose of the usual map orientation - confirm this is intended.
# NOTE(review): these are bare expressions; they presumably rely on notebook
# cell display to render - confirm when running as a plain script.
from ggplot import *
# errrange == 1
ggplot(aes(x='latitude', y='longitude', color='logerror'), df_train[df_train['errrange']==1]) + \
geom_point(alpha=0.1)+scale_color_gradient(low = 'white', high = 'red')+geom_jitter()
# errrange == 2
ggplot(aes(x='latitude', y='longitude', color='logerror'), df_train[df_train['errrange']==2]) + \
geom_point(alpha=0.1)+scale_color_gradient(low = 'white', high = 'red')+geom_jitter()
# Re-bind df_train to the master frame (an alias, not a copy).
df_train=org
# errrange == 3
ggplot(aes(x='latitude', y='longitude', color='logerror'), df_train[df_train['errrange']==3]) + \
geom_point(alpha=0.1)+scale_color_gradient(low = 'white', high = 'red')+geom_jitter()
from folium.plugins import MarkerCluster

# Centre of every map below.
# NOTE(review): the name says "SF", but these coordinates look like the
# Los Angeles area - confirm and consider renaming.
SF_COORDINATES = (34.088537, -118.249923)
# Upper bound on the number of markers placed on each map.
MAX_RECORDS = 1000


def _show_errrange_map(frame, errrange, marker_color):
    """Display a clustered folium map of the parcels in one errrange bucket.

    Parameters:
        frame: DataFrame with 'latitude', 'longitude', 'logerror' and
            'errrange' columns.
        errrange: value of the 'errrange' column to select.
        marker_color: folium icon colour used for every marker.

    Only the first MAX_RECORDS rows that have both coordinates are plotted.
    """
    selected = frame['errrange'] == errrange
    # Explicit copy so the in-place scaling below never touches `frame`.
    geo_df = frame.loc[selected, ['latitude', 'longitude', 'logerror', 'errrange']].copy()
    # Scale the stored coordinates down by 1e6
    # (presumably they are stored as degrees * 1e6 - confirm).
    geo_df[['latitude', 'longitude']] /= 1e6
    # Markers need both coordinates. The original dropped NaN coordinates only
    # for bucket 1; it is now done for every bucket.
    geo_df = geo_df.dropna(subset=['latitude', 'longitude'], axis=0)

    # Create an empty, zoomed-in map and cluster the markers on it.
    errrange_map = folium.Map(location=SF_COORDINATES, zoom_start=9)
    marker_cluster = MarkerCluster().add_to(errrange_map)
    shown = geo_df.iloc[:MAX_RECORDS]
    for lat, lon in zip(shown['latitude'], shown['longitude']):
        folium.Marker(
            [lat, lon], icon=folium.Icon(color=marker_color)
        ).add_to(marker_cluster)
    display(errrange_map)


# One map per errrange bucket: bucket 1 in green, buckets 2 and 3 in red.
for _errrange_value, _marker_color in ((1, 'green'), (2, 'red'), (3, 'red')):
    _show_errrange_map(org, _errrange_value, _marker_color)
# Start again from the master frame (alias, not a copy). The drops below
# return new frames, so `org` itself keeps all of its columns.
df_train = org

# Drop every column in which more than 75% of the rows are missing.
# NOTE(review): columns filled with -1 earlier in the file no longer count as
# missing here - confirm that is intended.
total_rows = org.shape[0]
sparse_columns = [
    name for name in df_train.columns
    if df_train[name].isnull().sum() / total_rows > .75
]
for name in sparse_columns:
    df_train = df_train.drop(name, axis=1)

# Drop the remaining object-dtype columns so that only numeric data is left.
object_columns = [
    name for name in df_train.columns
    if df_train[name].dtype == 'object'
]
for name in object_columns:
    df_train = df_train.drop(name, axis=1)
# Pairwise correlations of the remaining columns.
corr = df_train.corr()
# Keep only the strictly-lower triangle (k=-1 zeroes the diagonal and
# everything above it) so that each pair appears once.
low = np.tril(corr, k=-1)
# Column names in reverse order, used as the y tick labels below.
# NOTE(review): whether reversed labels match the heatmap's row order depends
# on the seaborn version in use - confirm.
new = corr.columns.tolist()[::-1]
new

tick_positions = np.arange(len(low))

# Full lower-triangle heatmap.
# Rotations are passed as numbers: the original used the strings '90' and '0',
# which newer Matplotlib releases reject (only 'vertical'/'horizontal' are
# accepted as strings).
plt.figure(figsize=(12, 12))
sns.heatmap(low);
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, new, rotation=0)
plt.show()

# Only the strongly positive pairs (correlation > 0.7); the mask hides every
# cell where it is True, i.e. everything else.
thresh = low > 0.7
plt.figure(figsize=(9, 9))
sns.heatmap(low, mask=~thresh)
plt.xticks(tick_positions, corr.columns, rotation=90);
plt.yticks(tick_positions, new, rotation=0);

# Only the strongly negative pairs (correlation < -0.5).
thresh = low < -0.5
plt.figure(figsize=(9, 9))
sns.heatmap(low, mask=~thresh)
plt.xticks(tick_positions, corr.columns, rotation=90);
plt.yticks(tick_positions, new, rotation=0);
# Building-quality plots, faceted by the `errrange` column.
# NOTE(review): FacetGrid is called with `size=`; newer seaborn releases name
# this parameter `height` - confirm against the installed version.
from ggplot import *;
# Count of each buildingqualitytypeid value, one facet per errrange value.
g = sns.FacetGrid(df_train, col = 'errrange',col_wrap=3,size=6)
# Applies to pyplot's current axes only (presumably the last facet - confirm).
plt.xticks(rotation=45)
g.map(sns.countplot, 'buildingqualitytypeid')
plt.tight_layout()
# Regression of logerror on buildingqualitytypeid over the whole frame.
sns.regplot(x='buildingqualitytypeid',y='logerror',data=df_train);
# Bivariate density of yearbuilt vs buildingqualitytypeid, one facet per
# errrange value.
sns.set(style="darkgrid")
g = sns.FacetGrid(df_train, col = 'errrange',size=4)
g.map(sns.kdeplot,"yearbuilt", "buildingqualitytypeid",n_levels=15)
plt.tight_layout()
# Bivariate density of lot size vs finished area, one facet per errrange
# value.
sns.set(style="darkgrid")
g = sns.FacetGrid(df_train, col='errrange', size=4)
g.map(sns.kdeplot, "lotsizesquarefeet", "finishedsquarefeet12", n_levels=15)

# Turn the x tick labels of every facet to vertical.
tick_labels = [
    label
    for facet_ax in g.axes.flat
    for label in facet_ax.get_xticklabels()
]
for label in tick_labels:
    label.set_rotation(90)
plt.tight_layout()
# Regression plots of derived ratios against logerror. Each regplot call
# draws on pyplot's current axes, so the statement order is significant.
# NOTE(review): the ratios divide by raw columns; zero (or placeholder)
# denominators would produce inf/negative values - confirm how they should be
# handled.

# Lot size relative to finished area.
ratio1=df_train['lotsizesquarefeet']/df_train['finishedsquarefeet12']
sns.regplot(x=ratio1,y=df_train['logerror']);
# Finished area per bedroom.
ratio2=df_train['finishedsquarefeet12']/df_train['bedroomcnt']
sns.regplot(x=ratio2,y=df_train['logerror']);
# Bedrooms plus bathrooms, plotted against ratio1 (not against logerror).
totroom=(df_train['bedroomcnt']+df_train['calculatedbathnbr'])
sns.regplot(x=totroom,y=ratio1);
# Tax amount against logerror.
sns.regplot(x='taxamount',y='logerror',data=df_train)
# Tax amount per square foot of lot.
ratio4=df_train['taxamount']/df_train['lotsizesquarefeet']
sns.regplot(x=ratio4,y='logerror',data=df_train)
# Tax amount per square foot of finished area.
ratio5=df_train['taxamount']/df_train['finishedsquarefeet12']
sns.regplot(x=ratio5,y='logerror',data=df_train)
# Interaction of bedroom and bathroom counts on logerror.
# NOTE(review): interactplot appears to exist only in older seaborn
# releases - confirm the installed version provides it.
sns.interactplot(x1='bedroomcnt', x2='bathroomcnt',y='logerror',data=df_train);
from sklearn import model_selection, preprocessing
import xgboost as xgb
import warnings

# Silence every warning from here on.
warnings.filterwarnings("ignore")

# Replace all missing values with the sentinel -999.
mergedFilterd = org.fillna(-999)

# Label-encode every object-dtype column to integer codes.
object_columns = [
    name for name in mergedFilterd.columns
    if mergedFilterd[name].dtype == 'object'
]
for f in object_columns:
    # NOTE(review): the values are deliberately handed over as a plain Python
    # list, as before; a list mixing strings and -999 may be coerced
    # differently from an object-dtype array - confirm before removing list().
    raw_values = list(mergedFilterd[f].values)
    lbl = preprocessing.LabelEncoder()
    mergedFilterd[f] = lbl.fit(raw_values).transform(raw_values)

# Target vector, and the feature matrix without identifier, date, target and
# error-range columns.
train_y = mergedFilterd['logerror'].values
non_feature_columns = ["parcelid", "transactiondate", "logerror", "errrange"]
train_X = mergedFilterd.drop(non_feature_columns, axis=1)
# Booster settings passed to xgboost.
xgb_params = dict(
    eta=0.05,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Wrap the features and target, keeping the column names as feature names.
dtrain = xgb.DMatrix(train_X, train_y, feature_names=train_X.columns.values)

# Train for 100 rounds with a copy of the settings in which `silent` is
# overridden to 0; xgb_params itself is left unchanged.
training_params = dict(xgb_params, silent=0)
model = xgb.train(training_params, dtrain, num_boost_round=100)
# Collect the booster's feature scores into a two-column frame, highest
# score first.
featureImportance = model.get_fscore()
features = pd.DataFrame({
    'features': list(featureImportance.keys()),
    'importance': list(featureImportance.values()),
})
features = features.sort_values(by=['importance'], ascending=False)

# Horizontal bar chart of the 15 highest-scoring features, saved to disk.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
plt.xticks(rotation=90)
top_features = features.head(15)
sns.barplot(data=top_features, x="importance", y="features", ax=ax, orient="h", color="#34495e");
plt.savefig('Top 15 features.png')